Machine learning project

Authors: Jose Pérez Cano & Álvaro Ribot Barrado

0. Libraries

install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
# LDA/ QDA
library(MASS)

# RDA
library(klaR)

# Multinomial
library(nnet)

# Cross-Validation
library(TunePareto)

# Naive Bayes
library(e1071)

# k-NN
library(class)

# 3d
library(rgl)

# LASSO
library(Matrix)
library(glmnet)
Loading required package: foreach
Loaded glmnet 2.0-16
# Correspondence analysis
library(ca)

1. Read data

set.seed(2105)
setwd("../data")
The working directory was changed to /Users/joseperezcano/Desktop/CFIS segundo curso/AA1/Project/data inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
clev <- read.csv("cleveland.csv", header=F)
hung <- read.csv("hungarian.csv", header=F)
va <- read.csv("long-beach-va.csv", header=F)
switz <- read.csv("switzerland.csv", header=F)

clev$location <- "cleveland"
hung$location <- "hungarian"
va$location <- "long-beach-va"
switz$location <- "switzerland"

heart1 <- rbind(clev, hung)
heart2 <- rbind(va, switz)
heart <- rbind(heart1, heart2)
head(heart)

2. Preprocess data

We apply clustering and several plotting techniques to have an idea of the dataset. In case there are NAs we will use k-NN for imputation.

To extract new features we will apply PCA and FDA and keep these new features and components apart.

# It says which columns are all missings
# The index are returned in negative to eliminate them
# Return column selectors for dropping columns that are entirely missing.
# A column counts as all-missing when every entry equals the sentinel -9.
# Indices are returned negated so the caller can write dd[, na.columns(dd)].
# When no column qualifies, all positive indices are returned instead:
# negating an empty vector (-NULL) is an error, and dd[, integer(0)]
# would silently drop every column.
na.columns <- function(dd){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    column <- dd[, i]
    if (min(column) == max(column) && min(column) == -9){
      rmlist <- c(rmlist, i)
    }
  }
  if (is.null(rmlist)) {
    return(seq_len(ncol(dd)))
  }
  -rmlist
}

clev <- clev[,na.columns(clev)]

# Returns columns with more NA than a given threshold, also in negative
# Return column selectors for dropping columns with more than `threshold`
# missing entries (missing = sentinel value -9). Indices are returned
# negated, as in na.columns(); when no column exceeds the threshold all
# positive indices are returned so that dd[, much.na.cols(dd, t)] keeps
# every column instead of erroring on -NULL.
much.na.cols <- function(dd, threshold){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    # na.rm guards columns where real NAs have already replaced the sentinel
    if (sum(dd[, i] == -9, na.rm = TRUE) > threshold){
      rmlist <- c(rmlist, i)
    }
  }
  if (is.null(rmlist)) {
    return(seq_len(ncol(dd)))
  }
  -rmlist
}

clev <- clev[, much.na.cols(clev, 60)]

# Applies k-nearest neighbour imputation for a given variable
# Impute the NA entries of `variable` (column `varname` of dd) by a
# k-nearest-neighbour vote over the remaining columns of dd.
# Fixes two defects of the original:
#  - class::knn returns a factor; assigning it directly into a numeric
#    vector inserts the factor's integer level codes, not the predicted
#    values, so we convert back through character first.
#  - if `variable` has no NAs there is nothing to impute; return it
#    unchanged instead of handing knn an empty test set.
knn.imputation <- function (dd, variable, varname, k)
{
  if (!anyNA(variable)) {
    return(variable)
  }
  aux <- subset(dd, select = names(dd)[names(dd) != varname])
  aux1 <- aux[!is.na(variable), ]   # complete rows act as the training set
  aux2 <- aux[is.na(variable), ]    # rows to impute act as the test set

  # Neither of aux1, aux2 can contain NAs
  knn.inc <- knn(aux1, aux2, variable[!is.na(variable)], k)
  imputed <- as.character(knn.inc)
  if (is.numeric(variable)) {
    imputed <- as.numeric(imputed)
  }
  variable[is.na(variable)] <- imputed
  variable
}

# These are the variables whose values were substituted by dummy values.
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- clev[,!(names(clev) %in% dummy)]

# knn imputation for clev
na.names <- names(clev)[-much.na.cols(clev, 0)]
for (name in na.names){
  clev[, name][clev[, name] == -9] <- NA
  clev[, name] <- knn.imputation(clev, clev[,name], name, 7)
}

Now, we analyse the correlations among variables since it will have impact on later models.

corr.factors <- cor(clev)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
    row col
V55  34  12
V57  36  14
V31  21  20
V29  20  21
V20  12  34
V22  14  36
rm.correlated <- c("V57", "V55")
clev <- clev[,!(names(clev) %in% rm.correlated)]

factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
  clev[,f] <- as.factor(clev[,f])
}

# Dummy level gets replaced
clev$V25[clev$V25 == 2] <- 1
clev$V25 <- droplevels(clev$V25)
levels(clev$V25)
[1] "0" "1"

This is the dataset after the treatment for missings.

summary(clev)
       V3        V4      V9           V10        V11    
 Min.   :29.00   0: 91   1: 22   Min.   : 94.0   0:108  
 1st Qu.:48.00   1:191   2: 43   1st Qu.:120.0   1:174  
 Median :55.00           3: 84   Median :130.0          
 Mean   :54.41           4:133   Mean   :131.6          
 3rd Qu.:61.00                   3rd Qu.:140.0          
 Max.   :77.00                   Max.   :200.0          
                                                        
      V12             V14             V15        V16     V18    
 Min.   :126.0   Min.   : 0.00   Min.   : 0.00   0:240   0:107  
 1st Qu.:213.0   1st Qu.: 0.00   1st Qu.: 0.00   1: 42   1:175  
 Median :244.0   Median :10.00   Median :15.00                  
 Mean   :249.1   Mean   :16.64   Mean   :15.08                  
 3rd Qu.:277.0   3rd Qu.:30.00   3rd Qu.:30.00                  
 Max.   :564.0   Max.   :99.00   Max.   :54.00                  
                                                                
 V19          V20          V21      V22     V23     V24    
 0:138   1      :38   4      : 13   81:67   0:271   0:186  
 1:  2   12     :33   15     : 13   82:96   1: 11   1: 95  
 2:142   2      :31   20     : 13   83:86           2:  1  
         8      :30   13     : 12   84:33                  
         6      :26   16     : 12                          
         11     :25   21     : 12                          
         (Other):99   (Other):207                          
 V25     V26     V27          V29              V31        
 0:211   0:252   0:248   Min.   : 1.800   Min.   : 3.000  
 1: 71   1: 30   1: 34   1st Qu.: 6.500   1st Qu.: 7.000  
                         Median : 8.500   Median : 9.000  
                         Mean   : 8.418   Mean   : 9.754  
                         3rd Qu.:10.075   3rd Qu.:12.000  
                         Max.   :15.000   Max.   :18.000  
                                                          
      V32             V33              V34       
 Min.   : 71.0   Min.   : 40.00   Min.   : 84.0  
 1st Qu.:133.2   1st Qu.: 65.00   1st Qu.:154.0  
 Median :153.5   Median : 74.00   Median :168.0  
 Mean   :149.8   Mean   : 75.12   Mean   :168.1  
 3rd Qu.:165.8   3rd Qu.: 84.00   3rd Qu.:184.0  
 Max.   :202.0   Max.   :119.00   Max.   :232.0  
                                                 
      V35              V37         V38     V39    
 Min.   : 26.00   Min.   : 50.00   0:190   0:276  
 1st Qu.: 70.00   1st Qu.: 80.00   1: 92   1:  6  
 Median : 80.00   Median : 85.00                  
 Mean   : 78.74   Mean   : 84.95                  
 3rd Qu.: 85.00   3rd Qu.: 90.00                  
 Max.   :120.00   Max.   :110.00                  
                                                  
      V40        V41          V43             V44         V51    
 Min.   :0.000   1:135   Min.   : 24.0   Min.   :0.0000   1:  2  
 1st Qu.:0.000   2:129   1st Qu.: 92.0   1st Qu.:0.0000   3:159  
 Median :0.800   3: 18   Median :118.0   Median :0.0000   6: 14  
 Mean   :1.027           Mean   :123.6   Mean   :0.6702   7:107  
 3rd Qu.:1.600           3rd Qu.:152.8   3rd Qu.:1.0000          
 Max.   :6.200           Max.   :270.0   Max.   :3.0000          
                                                                 
      V56      V58     V59     V60     V61     V63     V65    
 5      : 14   0:157   1:270   1:242   1:224   1:238   1:236  
 21     : 14   1: 50   2: 12   2: 40   2: 58   2: 44   2: 46  
 14     : 12   2: 31                                          
 1      : 11   3: 32                                          
 17     : 11   4: 12                                          
 30     : 11                                                  
 (Other):209                                                  
 V67     V68    
 1:233   1:246  
 2: 49   2: 36  
                
                
                
                
                

2.1 Visualizations

par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    hist(clev[,i], main = names(clev)[i], xlab="Values")
  }
}

par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    boxplot(clev[,i], xlab = names(clev)[i])
  }
}

par(mfrow = c(3,3))
for(i in 1:ncol(clev)){
  if (is.factor(clev[,i])) {
    hist(as.numeric(as.character(clev[,i])), main = names(clev)[i], xlab="Values")
  }
}

names_num <- c()
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    names_num <- c(names_num, i)
  }
}
clev_numeric <- clev[,names_num]

clev_cor <- cor(clev_numeric)
which(clev_cor > 0.5 & clev_cor < 1, arr.ind = TRUE)
    row col
V34  10   2
V37  12   2
V15   5   4
V14   4   5
V31   7   6
V29   6   7
V32   8   7
V31   7   8
V10   2  10
V37  12  11
V10   2  12
V35  11  12
which(-clev_cor > 0.5 & -clev_cor < 1, arr.ind = TRUE)
     row col

2.2 Modification of values

par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
    qqnorm(clev_numeric[,i], main = c("Q-Q Plot: ", names(clev_numeric)[i]))
    qqline(clev_numeric[,i], col=2)
}

Boxcox

par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
  boxcox(lm(clev_numeric[,i]-min(clev_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(clev_numeric))[i])
}

par(mfrow = c(1,3))
#we treat them as special cases
aux <- clev_numeric[,"V14"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))
aux <- clev_numeric[,"V15"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))
aux <- clev_numeric[,"V40"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))

clev_sqrt <- c("V10", "V12", "V31", "V43")
clev_sqrt_especial <- c("V14", "V40")
clev_box <- clev_numeric
#box-cox transformation
for (i in 1:ncol(clev_box)){
  if (names(clev_box)[i] %in% clev_sqrt) {
    clev_box[,i] <- 2*sqrt(clev_box[,i]-min(clev_box[,i])+1e-6)
  } else if (names(clev_box)[i] %in% clev_sqrt_especial){
    clev_box[,i] <- 2*sqrt(clev_box[,i])
  }
}
clev_box <- data.frame(scale(clev_box, scale = T)) #standarized
names(clev_box) <- names(clev_numeric)
clev_box$V14 <- clev_box$V14 - min(clev_box$V14)
clev_box$V15 <- clev_box$V15 - min(clev_box$V15)
clev_box$V40 <- clev_box$V40 - min(clev_box$V40)
par(mfrow = c(2,3))
for(i in 1:ncol(clev_box)){
    qqnorm(clev_box[,i], main = c("Q-Q Plot: ", names(clev_box)[i]))
    qqline(clev_box[,i], col=2)
}

For further models, normalisation will be useful.

And this is the final dataset after processing it.

clev[, names(clev_box)] <- clev_box[,names(clev_box)]
summary(clev)
       V3        V4      V9           V10        V11    
 Min.   :29.00   0: 91   1: 22   Min.   : 94.0   0:108  
 1st Qu.:48.00   1:191   2: 43   1st Qu.:120.0   1:174  
 Median :55.00           3: 84   Median :130.0          
 Mean   :54.41           4:133   Mean   :131.6          
 3rd Qu.:61.00                   3rd Qu.:140.0          
 Max.   :77.00                   Max.   :200.0          
                                                        
      V12             V14             V15        V16     V18    
 Min.   :126.0   Min.   : 0.00   Min.   : 0.00   0:240   0:107  
 1st Qu.:213.0   1st Qu.: 0.00   1st Qu.: 0.00   1: 42   1:175  
 Median :244.0   Median :10.00   Median :15.00                  
 Mean   :249.1   Mean   :16.64   Mean   :15.08                  
 3rd Qu.:277.0   3rd Qu.:30.00   3rd Qu.:30.00                  
 Max.   :564.0   Max.   :99.00   Max.   :54.00                  
                                                                
 V19          V20          V21      V22     V23     V24    
 0:138   1      :38   4      : 13   81:67   0:271   0:186  
 1:  2   12     :33   15     : 13   82:96   1: 11   1: 95  
 2:142   2      :31   20     : 13   83:86           2:  1  
         8      :30   13     : 12   84:33                  
         6      :26   16     : 12                          
         11     :25   21     : 12                          
         (Other):99   (Other):207                          
 V25     V26     V27          V29              V31        
 0:211   0:252   0:248   Min.   : 1.800   Min.   : 3.000  
 1: 71   1: 30   1: 34   1st Qu.: 6.500   1st Qu.: 7.000  
                         Median : 8.500   Median : 9.000  
                         Mean   : 8.418   Mean   : 9.754  
                         3rd Qu.:10.075   3rd Qu.:12.000  
                         Max.   :15.000   Max.   :18.000  
                                                          
      V32             V33              V34       
 Min.   : 71.0   Min.   : 40.00   Min.   : 84.0  
 1st Qu.:133.2   1st Qu.: 65.00   1st Qu.:154.0  
 Median :153.5   Median : 74.00   Median :168.0  
 Mean   :149.8   Mean   : 75.12   Mean   :168.1  
 3rd Qu.:165.8   3rd Qu.: 84.00   3rd Qu.:184.0  
 Max.   :202.0   Max.   :119.00   Max.   :232.0  
                                                 
      V35              V37         V38     V39    
 Min.   : 26.00   Min.   : 50.00   0:190   0:276  
 1st Qu.: 70.00   1st Qu.: 80.00   1: 92   1:  6  
 Median : 80.00   Median : 85.00                  
 Mean   : 78.74   Mean   : 84.95                  
 3rd Qu.: 85.00   3rd Qu.: 90.00                  
 Max.   :120.00   Max.   :110.00                  
                                                  
      V40        V41          V43             V44         V51    
 Min.   :0.000   1:135   Min.   : 24.0   Min.   :0.0000   1:  2  
 1st Qu.:0.000   2:129   1st Qu.: 92.0   1st Qu.:0.0000   3:159  
 Median :0.800   3: 18   Median :118.0   Median :0.0000   6: 14  
 Mean   :1.027           Mean   :123.6   Mean   :0.6702   7:107  
 3rd Qu.:1.600           3rd Qu.:152.8   3rd Qu.:1.0000          
 Max.   :6.200           Max.   :270.0   Max.   :3.0000          
                                                                 
      V56      V58     V59     V60     V61     V63     V65    
 5      : 14   0:157   1:270   1:242   1:224   1:238   1:236  
 21     : 14   1: 50   2: 12   2: 40   2: 58   2: 44   2: 46  
 14     : 12   2: 31                                          
 1      : 11   3: 32                                          
 17     : 11   4: 12                                          
 30     : 11                                                  
 (Other):209                                                  
 V67     V68    
 1:233   1:246  
 2: 49   2: 36  
                
                
                
                
                

2.3. Feature extraction

Separate train and test data, seed for reproducibility.

set.seed(2000)
n <- nrow(clev)
train.lenght <- round(2*n/3)

clev <- clev[sample(n),]
train <- clev[1:train.lenght,]
test <- clev[(train.lenght+1):n,]
names_num <- c()
for(i in 1:ncol(train)){
  if (!is.factor(train[,i])) {
    names_num <- c(names_num, i)
  }
}
train_num <- train[,names_num]

Extract PCA features.

pca <- princomp(train_num)
screeplot(pca)

summary(pca)
Importance of components:
                          Comp.1    Comp.2    Comp.3     Comp.4
Standard deviation     1.8444732 1.5577035 1.3875402 1.21213461
Proportion of Variance 0.2297381 0.1638543 0.1300108 0.09921788
Cumulative Proportion  0.2297381 0.3935924 0.5236032 0.62282104
                           Comp.5     Comp.6     Comp.7    Comp.8
Standard deviation     1.01035405 0.97964763 0.85803949 0.7938423
Proportion of Variance 0.06893431 0.06480791 0.04971676 0.0425556
Cumulative Proportion  0.69175535 0.75656326 0.80628002 0.8488356
                           Comp.9    Comp.10    Comp.11
Standard deviation     0.73133601 0.69242679 0.62465621
Proportion of Variance 0.03611787 0.03237695 0.02634938
Cumulative Proportion  0.88495350 0.91733045 0.94367983
                          Comp.12    Comp.13    Comp.14
Standard deviation     0.57654420 0.50756673 0.43609672
Proportion of Variance 0.02244675 0.01739701 0.01284263
Cumulative Proportion  0.96612658 0.98352359 0.99636622
                           Comp.15
Standard deviation     0.231971913
Proportion of Variance 0.003633784
Cumulative Proportion  1.000000000

Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) * 2

col.class <- as.numeric(train$V58)
col.class[col.class==1] <- "red"
col.class[col.class==2] <- "green"
col.class[col.class==3] <- "blue"
col.class[col.class==4] <- "yellow"
col.class[col.class==5] <- "purple"

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

V1, V59, and V57 create many problems

problematic <- c("V57", "V59")
train <- train[, !(names(train) %in% problematic)]
fda <- lda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
#plot(fda)
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

train$LD1 <- loadings[,1]
train$LD2 <- loadings[,2]
train$LD3 <- loadings[,3]
train$LD4 <- loadings[,4]

fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[,1]
test$LD2 <- fda_test$x[,2]
test$LD3 <- fda_test$x[,3]
test$LD4 <- fda_test$x[,4]

Correspondence analysis

ac <- mjca(clev[,names(clev) %in% factores], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")


ac_ind <- mjca(train[,names(train) %in% factores], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))


mca.features <- ac_ind$rowcoord

3. Resampling protocol

Principal utility function for doing cross-validation.

First we separate in train and test.

Create the CV function for any model with associated predict and update functions.

# Generic repeated stratified k-fold cross-validation for any model that
# supports update() and predict(). `times` repetitions of `nfolds` folds
# are generated from `target`; the mean validation error over all
# repetitions is returned. Set need.class = TRUE for predict methods that
# return a list with a $class component (e.g. lda/qda/rda).
cross.validation <- function(data, target, model, times, nfolds, need.class){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  run.errors <- numeric(0)
  for (it in seq_len(times)){
    fold.errors <- numeric(nfolds)
    for (fold in seq_len(nfolds)){
      print(paste0("Fold: ", fold))
      hold.out <- unlist(CV.folds[[it]][[fold]])

      # Refit on everything except the hold-out fold, then score on it.
      fitted <- update(model, data = data[-hold.out, ])
      guess <- predict(fitted, newdata = data[hold.out, ])
      if (need.class){
        guess <- guess$class
      }

      confusion <- table(True = target[hold.out], Predicted = guess)
      fold.errors[fold] <- 1 - sum(diag(confusion)) / sum(confusion)
    }
    run.errors <- c(run.errors, mean(fold.errors))
    print(paste0("Iteration ", it, ", mean error: ", mean(fold.errors)))
  }
  mean(run.errors)
}

Cross-validation for k-Nearest Neighbour

# Repeated stratified k-fold cross-validation specialised for k-NN,
# which has no model object: the classifier is re-run on each fold's
# training rows. Returns the mean validation error across repetitions.
cross.validation.knn <- function(data, target, times, nfolds, K){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  run.errors <- numeric(0)
  for (it in seq_len(times)){
    fold.errors <- numeric(nfolds)
    for (fold in seq_len(nfolds)){
      hold.out <- unlist(CV.folds[[it]][[fold]])

      guess <- knn(data[-hold.out, ], data[hold.out, ], target[-hold.out], k = K)

      confusion <- table(True = target[hold.out], Predicted = guess)
      fold.errors[fold] <- 1 - sum(diag(confusion)) / sum(confusion)
    }
    run.errors <- c(run.errors, mean(fold.errors))
  }
  mean(run.errors)
}

Cross-validation Naive-Bayes

# Repeated stratified k-fold cross-validation for naive Bayes.
# NOTE(review): the `model` argument is accepted for interface symmetry
# with cross.validation() but is not used — the fit is rebuilt each fold
# from the fixed formula below (V58 on all predictors except the FDA
# loadings LD1-LD4).
cross.validation.naive <- function(data, target, model, times, nfolds){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  run.errors <- numeric(0)
  for (it in seq_len(times)){
    fold.errors <- numeric(nfolds)
    for (fold in seq_len(nfolds)){
      print(paste0("Fold: ", fold))
      hold.out <- unlist(CV.folds[[it]][[fold]])

      fitted <- naiveBayes(V58~.-LD1-LD2-LD3-LD4, data=data[-hold.out, ])
      guess <- predict(fitted, newdata=data[hold.out, ])

      confusion <- table(True = target[hold.out], Predicted = guess)
      fold.errors[fold] <- 1 - sum(diag(confusion)) / sum(confusion)
    }
    run.errors <- c(run.errors, mean(fold.errors))
    print(paste0("Iteration ", it, ", mean error: ", mean(fold.errors)))
  }
  mean(run.errors)
}

4. Models

The models we are going to use are: - LDA - QDA - RDA - k-NN - Naïve Bayes - GLM

rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
cross.validation(train, train$V58, rda.model, 10, 10, T)
rda.model.fda <- rda(V58~.,data=train)
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
cross.validation.naive(train, train$V58, naive.model, 10, 10)
err <- c()
for (k in 1:20){
  err <- c(err, cross.validation.knn(train, train$V58, 10,10, k))
}
plot(err, type = "l")

err
 [1] 0.1519883 0.1965205 0.1950585 0.2004678 0.2073977 0.2152924
 [7] 0.2184795 0.2276023 0.2313450 0.2323392 0.2419883 0.2456433
[13] 0.2642398 0.2690351 0.2743275 0.2791228 0.2871053 0.2955556
[19] 0.3003801 0.3050877
cross.validation.knn(train, train$V58, 10, 10, 1)

multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, F)

multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)

multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)

Test error

rda.model <- update(rda.model.fda, data=train)
pred.test <- predict(rda.model.fda, test)
pred.test <- pred.test$class
(err.table <- table(True=test$V58, Pred=pred.test))
    Pred
True  0  1  2  3  4
   0 40  5  0  0  0
   1  6 13  1  0  2
   2  0  2  6  4  1
   3  0  0  1  9  0
   4  0  0  0  1  3
(err.test <- 1-sum(diag(err.table))/sum(err.table))
[1] 0.2446809

Part 2

We will now use the model on a dataset not yet investigated: Hungary.

hung <- read.table("../data/processed.hungarian.data", header=F, sep=',', na.strings="?")
summary(hung)
       V1              V2               V3              V4       
 Min.   :28.00   Min.   :0.0000   Min.   :1.000   Min.   : 92.0  
 1st Qu.:42.00   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:120.0  
 Median :49.00   Median :1.0000   Median :3.000   Median :130.0  
 Mean   :47.83   Mean   :0.7245   Mean   :2.983   Mean   :132.6  
 3rd Qu.:54.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
 Max.   :66.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
                                                  NA's   :1      
       V5              V6                V7        
 Min.   : 85.0   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:209.0   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :243.0   Median :0.00000   Median :0.0000  
 Mean   :250.8   Mean   :0.06993   Mean   :0.2184  
 3rd Qu.:282.5   3rd Qu.:0.00000   3rd Qu.:0.0000  
 Max.   :603.0   Max.   :1.00000   Max.   :2.0000  
 NA's   :23      NA's   :8         NA's   :1       
       V8              V9              V10        
 Min.   : 82.0   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:122.0   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :140.0   Median :0.0000   Median :0.0000  
 Mean   :139.1   Mean   :0.3038   Mean   :0.5861  
 3rd Qu.:155.0   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :190.0   Max.   :1.0000   Max.   :5.0000  
 NA's   :1       NA's   :1                        
      V11             V12           V13             V14        
 Min.   :1.000   Min.   :0     Min.   :3.000   Min.   :0.0000  
 1st Qu.:2.000   1st Qu.:0     1st Qu.:5.250   1st Qu.:0.0000  
 Median :2.000   Median :0     Median :6.000   Median :0.0000  
 Mean   :1.894   Mean   :0     Mean   :5.643   Mean   :0.3605  
 3rd Qu.:2.000   3rd Qu.:0     3rd Qu.:7.000   3rd Qu.:1.0000  
 Max.   :3.000   Max.   :0     Max.   :7.000   Max.   :1.0000  
 NA's   :190     NA's   :291   NA's   :266                     

Missing values treatment

rm.var <- c("V11", "V12", "V13")
hung <- hung[,!(names(hung) %in% rm.var)]
hung[is.na(hung)] <- -9
var.imputable <- c("V4", "V5", "V6", "V7", "V8", "V9")
for (nom in var.imputable){
  hung[, nom][hung[, nom] == -9] <- NA
  variable <- hung[, nom]
  hung[, nom] <- knn.imputation(hung, variable, nom, 7)
}
var.factor <- c("V2", "V3", "V6", "V7", "V9", "V14")
for (nom in var.factor){
  hung[, nom] <- as.factor(as.character(hung[,nom]))
}
summary(hung)
       V1        V2      V3            V4              V5       
 Min.   :28.00   0: 81   1: 11   Min.   : 27.0   Min.   :  4.0  
 1st Qu.:42.00   1:213   2:106   1st Qu.:120.0   1st Qu.:198.0  
 Median :49.00           3: 54   Median :130.0   Median :237.0  
 Mean   :47.83           4:123   Mean   :132.2   Mean   :236.6  
 3rd Qu.:54.00                   3rd Qu.:140.0   3rd Qu.:277.0  
 Max.   :66.00                   Max.   :200.0   Max.   :603.0  
 V6      V7            V8        V9           V10         V14    
 0:266   0:235   Min.   : 54.0   0:204   Min.   :0.0000   0:188  
 1: 28   1: 53   1st Qu.:122.0   1: 89   1st Qu.:0.0000   1:106  
         2:  6   Median :140.0   2:  1   Median :0.0000          
                 Mean   :138.8           Mean   :0.5861          
                 3rd Qu.:155.0           3rd Qu.:1.0000          
                 Max.   :190.0           Max.   :5.0000          

Gaussianity treatment

names_num <- c()
for(i in 1:ncol(hung)){
  if (!is.factor(hung[,i])) {
    names_num <- c(names_num, i)
  }
}
hung_numeric <- hung[,names_num]

hung_cor <- cor(hung_numeric)
which(hung_cor > 0.5 & hung_cor < 1, arr.ind = TRUE)
     row col
which(-hung_cor > 0.5 & -hung_cor < 1, arr.ind = TRUE)
     row col
par(mfrow = c(2,3))
for(i in 1:length(hung_numeric)){
    qqnorm(hung_numeric[,i], main = c("Q-Q Plot: ", names(hung_numeric)[i]))
    qqline(hung_numeric[,i], col=2)
}

Boxcox

par(mfrow = c(2,3))
for(i in 1:length(hung_numeric)){
  boxcox(lm(hung_numeric[,i]-min(hung_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(hung_numeric))[i])
}

We see var 10 requires a logarithmic transformation.

qqnorm(hung[,"V10"])
qqline(hung[,"V10"], col=2)

Feature extraction

Separate train and test data, seed for reproducibility.

set.seed(2000)
n <- nrow(hung)
train.lenght <- round(2*n/3)

hung <- hung[sample(n),]
train <- hung[1:train.lenght,]
test <- hung[(train.lenght+1):n,]
names_num <- c()
for(i in 1:ncol(train)){
  if (!is.factor(train[,i])) {
    names_num <- c(names_num, i)
  }
}
train_num <- train[,names_num]

Extract PCA features.

pca <- princomp(train_num)
screeplot(pca)

summary(pca)
Importance of components:
                           Comp.1      Comp.2      Comp.3
Standard deviation     86.0161788 24.47912423 17.80405417
Proportion of Variance  0.8849996  0.07167612  0.03791583
Cumulative Proportion   0.8849996  0.95667568  0.99459151
                            Comp.4       Comp.5
Standard deviation     6.709559731 0.4448637716
Proportion of Variance 0.005384815 0.0000236721
Cumulative Proportion  0.999976328 1.0000000000

Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) *0.3

col.class <- as.numeric(train$V14)
col.class[col.class==0] <- "red"
col.class[col.class==1] <- "blue"

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

fda <- lda(V14~., data=train)
#plot(fda)
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

train$LD1 <- loadings[,1]

fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[,1]

Correspondence analysis

ac <- mjca(hung[,names(hung) %in% var.factor], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")


ac_ind <- mjca(train[,names(train) %in% var.factor], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))


mca.features <- ac_ind$rowcoord

---
title: "R Notebook"
output: html_notebook
---


# Machine learning project
## Authors: Jose Pérez Cano & Álvaro Ribot Barrado

### 0. Libraries

```{r}
install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
```

```{r}
# LDA/ QDA
library(MASS)

# RDA
library(klaR)

# Multinomial
library(nnet)

# Cross-Validation
library(TunePareto)

# Naive Bayes
library(e1071)

# k-NN
library(class)

# 3d
library(rgl)

# LASSO
library(Matrix)
library(glmnet)

# Correspondence analysis
library(ca)
```


### 1. Read data

```{r}
set.seed(2105)
setwd("../data")
clev <- read.csv("cleveland.csv", header=F)
hung <- read.csv("hungarian.csv", header=F)
va <- read.csv("long-beach-va.csv", header=F)
switz <- read.csv("switzerland.csv", header=F)

clev$location <- "cleveland"
hung$location <- "hungarian"
va$location <- "long-beach-va"
switz$location <- "switzerland"

heart1 <- rbind(clev, hung)
heart2 <- rbind(va, switz)
heart <- rbind(heart1, heart2)
head(heart)
```


### 2. Preprocess data
We apply clustering and several plotting techniques to have an idea of the dataset. In case there are NAs we will use k-NN for imputation. 

To extract new features we will apply PCA and FDA and keep these new features and components apart.

```{r}
# It says which columns are all missings
# The index are returned in negative to eliminate them
# Return column selectors for dropping columns that are entirely missing.
# A column counts as all-missing when every entry equals the sentinel -9.
# Indices are returned negated so the caller can write dd[, na.columns(dd)].
# When no column qualifies, all positive indices are returned instead:
# negating an empty vector (-NULL) is an error, and dd[, integer(0)]
# would silently drop every column.
na.columns <- function(dd){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    column <- dd[, i]
    if (min(column) == max(column) && min(column) == -9){
      rmlist <- c(rmlist, i)
    }
  }
  if (is.null(rmlist)) {
    return(seq_len(ncol(dd)))
  }
  -rmlist
}

clev <- clev[,na.columns(clev)]

# Returns columns with more NA than a given threshold, also in negative
# Return column selectors for dropping columns with more than `threshold`
# missing entries (missing = sentinel value -9). Indices are returned
# negated, as in na.columns(); when no column exceeds the threshold all
# positive indices are returned so that dd[, much.na.cols(dd, t)] keeps
# every column instead of erroring on -NULL.
much.na.cols <- function(dd, threshold){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    # na.rm guards columns where real NAs have already replaced the sentinel
    if (sum(dd[, i] == -9, na.rm = TRUE) > threshold){
      rmlist <- c(rmlist, i)
    }
  }
  if (is.null(rmlist)) {
    return(seq_len(ncol(dd)))
  }
  -rmlist
}

clev <- clev[, much.na.cols(clev, 60)]

# Applies k-nearest neighbour imputation for a given variable
# Impute the NA entries of `variable` (column `varname` of dd) by a
# k-nearest-neighbour vote over the remaining columns of dd.
# Fixes two defects of the original:
#  - class::knn returns a factor; assigning it directly into a numeric
#    vector inserts the factor's integer level codes, not the predicted
#    values, so we convert back through character first.
#  - if `variable` has no NAs there is nothing to impute; return it
#    unchanged instead of handing knn an empty test set.
knn.imputation <- function (dd, variable, varname, k)
{
  if (!anyNA(variable)) {
    return(variable)
  }
  aux <- subset(dd, select = names(dd)[names(dd) != varname])
  aux1 <- aux[!is.na(variable), ]   # complete rows act as the training set
  aux2 <- aux[is.na(variable), ]    # rows to impute act as the test set

  # Neither of aux1, aux2 can contain NAs
  knn.inc <- knn(aux1, aux2, variable[!is.na(variable)], k)
  imputed <- as.character(knn.inc)
  if (is.numeric(variable)) {
    imputed <- as.numeric(imputed)
  }
  variable[is.na(variable)] <- imputed
  variable
}

# These are the variables whose values were substituted by dummy values.
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- clev[,!(names(clev) %in% dummy)]

# knn imputation for clev
na.names <- names(clev)[-much.na.cols(clev, 0)]
for (name in na.names){
  clev[, name][clev[, name] == -9] <- NA
  clev[, name] <- knn.imputation(clev, clev[,name], name, 7)
}
```

Now, we analyse the correlations among variables since it will have impact on later models.

```{r}
corr.factors <- cor(clev)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)

rm.correlated <- c("V57", "V55")
clev <- clev[,!(names(clev) %in% rm.correlated)]

factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
  clev[,f] <- as.factor(clev[,f])
}

# Dummy level gets replaced
clev$V25[clev$V25 == 2] <- 1
clev$V25 <- droplevels(clev$V25)
levels(clev$V25)
```

This is the dataset after the treatment for missings.

```{r}
summary(clev)
```


#### 2.1 Visualizations

```{r}
par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    hist(clev[,i], main = names(clev)[i], xlab="Values")
  }
}
```


```{r}
par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    boxplot(clev[,i], xlab = names(clev)[i])
  }
}
```


```{r}
par(mfrow = c(3,3))
for(i in 1:ncol(clev)){
  if (is.factor(clev[,i])) {
    hist(as.numeric(as.character(clev[,i])), main = names(clev)[i], xlab="Values")
  }
}
```


```{r}
names_num <- c()
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    names_num <- c(names_num, i)
  }
}
clev_numeric <- clev[,names_num]

clev_cor <- cor(clev_numeric)
which(clev_cor > 0.5 & clev_cor < 1, arr.ind = TRUE)
which(-clev_cor > 0.5 & -clev_cor < 1, arr.ind = TRUE)
```


#### 2.2 Modification of values

```{r}
par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
    qqnorm(clev_numeric[,i], main = c("Q-Q Plot: ", names(clev_numeric)[i]))
    qqline(clev_numeric[,i], col=2)
}
```


Boxcox

```{r}
par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
  boxcox(lm(clev_numeric[,i]-min(clev_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(clev_numeric))[i])
}
```

```{r}
par(mfrow = c(1, 3))
# Special cases: these variables have a spike at zero, so the Box-Cox
# profile is examined on their strictly positive values only.
for (vn in c("V14", "V15", "V40")) {
  positive <- clev_numeric[, vn]
  positive <- positive[positive != 0]
  boxcox(lm(positive ~ 1), lambda = seq(-2, 1.5, by = 0.1))
}
```


```{r}
# Variables whose Box-Cox analysis suggested a square-root transform.
clev_sqrt <- c("V10", "V12", "V31", "V43")
# Special cases: applied without the min-shift (values already non-negative).
clev_sqrt_especial <- c("V14", "V40")
clev_box <- clev_numeric
# Box-Cox-style transformation (lambda = 1/2, i.e. 2*sqrt).
for (i in seq_len(ncol(clev_box))) {
  if (names(clev_box)[i] %in% clev_sqrt) {
    clev_box[, i] <- 2 * sqrt(clev_box[, i] - min(clev_box[, i]) + 1e-6)
  } else if (names(clev_box)[i] %in% clev_sqrt_especial) {
    clev_box[, i] <- 2 * sqrt(clev_box[, i])
  }
}
clev_box <- data.frame(scale(clev_box, scale = TRUE))  # standardized
names(clev_box) <- names(clev_numeric)
# Shift the zero-inflated variables back to a non-negative range.
# NOTE(review): V15 is shifted like V14/V40 although no sqrt transform was
# applied to it above — confirm this is intended.
clev_box$V14 <- clev_box$V14 - min(clev_box$V14)
clev_box$V15 <- clev_box$V15 - min(clev_box$V15)
clev_box$V40 <- clev_box$V40 - min(clev_box$V40)
```


```{r}
# Q-Q plots after the transformation, to check the gain in normality.
par(mfrow = c(2, 3))
for (i in seq_len(ncol(clev_box))) {
  # paste() builds a single title string; the original length-2 vector
  # passed as `main` drew the two pieces on top of each other.
  qqnorm(clev_box[, i], main = paste("Q-Q Plot:", names(clev_box)[i]))
  qqline(clev_box[, i], col = 2)
}
```

For further models, normalisation will be useful.

```{r}
clev_box <- apply(clev_box, 2, scale)
```


And this is the final dataset after processing it.

```{r}
# Write the transformed numeric columns back into the full dataset.
# colnames() is used because clev_box can be a plain matrix at this point
# (result of apply()), and names() on a matrix returns NULL — the original
# line then selected zero columns and silently assigned nothing.
clev[, colnames(clev_box)] <- clev_box[, colnames(clev_box)]
summary(clev)
```


#### 2.3. Feature extraction

Separate the data into train and test sets, setting a seed for reproducibility.

```{r}
# Reproducible 2/3 - 1/3 train/test split.
set.seed(2000)
n <- nrow(clev)
n.train <- round(2 * n / 3)

clev <- clev[sample(n), ]  # shuffle the rows first
train <- clev[1:n.train, ]
test <- clev[(n.train + 1):n, ]
```

```{r}
# Column indices of the numeric (non-factor) training variables.
# Vectorized with vapply instead of growing a vector inside a loop.
names_num <- unname(which(!vapply(train, is.factor, logical(1))))
train_num <- train[, names_num]
```

Extract PCA features.

```{r}
# PCA on the numeric training columns; the scree plot and the summary show
# how much variance each component explains.
pca <- princomp(train_num)
screeplot(pca)
summary(pca)
```


```{r}
biplot(pca)
```

```{r}
# Hand-drawn biplot: standardised row scores coloured by class, plus the
# variable loadings scaled (x2) for visibility.
scores.std <- pca$scores %*% diag(1 / pca$sdev)
loads.scaled <- pca$loadings %*% diag(pca$sdev) * 2

# One colour per disease-severity class; as.numeric() on the factor gives
# level codes 1..5, used to index the palette.
palette.map <- c("red", "green", "blue", "yellow", "purple")
col.class <- palette.map[as.numeric(train$V58)]

plot(scores.std[, 1], scores.std[, 2], asp = 1, col = col.class,
     xlab = "First principal component", ylab = "Second principal component")
n.vars <- nrow(loads.scaled)
arrows(rep(0, n.vars), rep(0, n.vars), loads.scaled[, 1], loads.scaled[, 2])
text(loads.scaled[, 1], loads.scaled[, 2], names(train_num), col = "black")
legend("bottomright", fill = palette.map, legend = c('0', '1', '2', '3', '4'))
```


V1, V59 and V57 create many problems.

```{r}
# Drop the problematic variables before the discriminant analysis.
# (V57 was already removed with the correlated pair above, so removing it
# again here is a harmless no-op.)
problematic <- c("V57", "V59")
train <- train[, !(names(train) %in% problematic)]
# Fisher/linear discriminant analysis with an explicit predictor list.
fda <- lda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
#plot(fda)
# Project the training rows onto the discriminant axes and plot the first
# two, coloured by class.
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))
```


```{r}
# Attach the four discriminant coordinates as extra training features.
for (d in 1:4) {
  train[[paste0("LD", d)]] <- loadings[, d]
}

# Project the test rows with the fitted FDA and attach the same features.
fda_test <- predict(fda, newdata = test)
for (d in 1:4) {
  test[[paste0("LD", d)]] <- fda_test$x[, d]
}
```


Correspondence analysis

```{r}
# Multiple correspondence analysis of the categorical variables
# (Burt-matrix variant), on the full Cleveland dataset.
ac <- mjca(clev[,names(clev) %in% factores], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")

# Indicator-matrix MCA on the training rows only; the row coordinates are
# plotted by class and kept as candidate features below.
ac_ind <- mjca(train[,names(train) %in% factores], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

mca.features <- ac_ind$rowcoord
```


### 3. Resampling protocol
Principal utility function for doing cross-validation. 

First we separate in train and test.

Create the CV function for any model with associated predict and update functions.

```{r}
cross.validation <- function(data, target, model, times, nfolds, need.class){
  # Repeated stratified k-fold cross-validation for any model supporting
  # update() and predict().
  #
  # data       data.frame holding predictors (and the column the model
  #            formula uses as response)
  # target     response vector; used both for stratified fold generation
  #            and for scoring
  # model      an already-fitted model object; refitted on each training
  #            fold via update(model, data = fold)
  # times      number of CV repetitions
  # nfolds     number of folds per repetition
  # need.class TRUE when predict() returns a list whose $class element
  #            holds the predicted labels (e.g. lda/qda/rda)
  #
  # Returns the mean validation error rate over all repetitions.
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes = times, nfold = nfolds,
                             stratified = TRUE)

  err.total <- numeric(times)      # preallocated instead of grown with c()
  for (i in seq_len(times)) {
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)) {
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])  # row indices of the validation fold

      tr <- data[-val, ]
      va <- data[val, ]

      model <- update(model, data = tr)
      pred <- predict(model, newdata = va)
      if (need.class) {
        pred <- pred$class
      }

      # Error rate = 1 - accuracy from the confusion table.
      err.table <- table(True = target[val], Predicted = pred)
      err.onetime[j] <- 1 - sum(diag(err.table)) / sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}
```


Cross-validation for k-Nearest Neighbour

```{r}
cross.validation.knn <- function(data, target, times, nfolds, K){
  # Repeated stratified k-fold cross-validation for k-nearest neighbours.
  #
  # data    predictor data (passed straight to class::knn)
  # target  class labels, used for stratification and scoring
  # times   number of CV repetitions
  # nfolds  number of folds per repetition
  # K       number of neighbours
  #
  # Returns the mean validation error rate over all repetitions.
  # NOTE(review): class::knn expects numeric predictors; confirm callers
  # do not pass factor columns in `data`.
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes = times, nfold = nfolds,
                             stratified = TRUE)

  err.total <- numeric(times)      # preallocated instead of grown with c()
  for (i in seq_len(times)) {
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)) {
      val <- unlist(CV.folds[[i]][[j]])  # validation-fold row indices

      tr <- data[-val, ]
      va <- data[val, ]

      # knn() takes the training labels directly; no model object is kept.
      pred <- knn(tr, va, target[-val], k = K)

      err.table <- table(True = target[val], Predicted = pred)
      err.onetime[j] <- 1 - sum(diag(err.table)) / sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
  }
  mean(err.total)
}
```


Cross-validation Naive-Bayes

```{r}
cross.validation.naive <- function(data, target, model, times, nfolds){
  # Repeated stratified k-fold cross-validation for Naive Bayes.
  #
  # data    data.frame containing V58 (response) and the predictors
  # target  response vector, used for stratification and scoring
  # model   kept for interface compatibility with cross.validation(), but
  #         IGNORED: a naiveBayes model with the hard-coded formula below
  #         is refitted on every fold
  # times   number of CV repetitions
  # nfolds  number of folds per repetition
  #
  # Returns the mean validation error rate over all repetitions.
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes = times, nfold = nfolds,
                             stratified = TRUE)

  err.total <- numeric(times)      # preallocated instead of grown with c()
  for (i in seq_len(times)) {
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)) {
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])  # validation-fold row indices

      tr <- data[-val, ]
      va <- data[val, ]

      # Refit excluding the FDA-derived features LD1-LD4.
      model <- naiveBayes(V58~.-LD1-LD2-LD3-LD4, data=tr)
      pred <- predict(model, newdata=va)

      err.table <- table(True = target[val], Predicted = pred)
      err.onetime[j] <- 1 - sum(diag(err.table)) / sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}
```


### 4. Models
The models we are going to use are: 
  - LDA
  - QDA
  - RDA
  - k-NN
  - Naïve Bayes
  - GLM
 

```{r}
# Fit RDA and Naive Bayes on the original features (explicit formula,
# excluding the FDA coordinates LD1-LD4 and the dropped variables).
rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
```


```{r}
cross.validation(train, train$V58, rda.model, 10, 10, T)
```


```{r}
rda.model.fda <- rda(V58~.,data=train)
```


```{r}
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
```


```{r}
cross.validation.naive(train, train$V58, naive.model, 10, 10)
```


```{r}
# CV error of k-NN for k = 1..20, used below to pick the best k.
# Preallocated instead of growing the vector with c() in the loop.
err <- numeric(20)
for (k in 1:20){
  err[k] <- cross.validation.knn(train, train$V58, 10, 10, k)
}
```


```{r}
# Validation-error curve over k; printing err gives the exact values.
plot(err, type = "l")
err
```


```{r}
# CV error of k-NN with k = 1 (chosen from the sweep above).
cross.validation.knn(train, train$V58, 10, 10, 1)

# Multinomial logistic regression on all features (including LD1-LD4).
multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, F)

# AIC-driven stepwise variable selection on the multinomial model.
multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)

# Same model but excluding the FDA-derived features.
multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)
```


## Test error

```{r}
# Refit the chosen model on the full training set and evaluate it on the
# held-out test set.
rda.model <- update(rda.model.fda, data=train)
# Predict with the refitted model. (The original code predicted with
# rda.model.fda, which made the update() above a no-op.)
pred.test <- predict(rda.model, test)
pred.test <- pred.test$class
(err.table <- table(True=test$V58, Pred=pred.test))
(err.test <- 1-sum(diag(err.table))/sum(err.table))
```

# Part 2

We will now apply the same modelling approach to a dataset not yet investigated: the Hungarian one.

```{r}
# Read the processed Hungarian file; "?" encodes missing values, so they
# arrive as NA.
hung <- read.table("../data/processed.hungarian.data", header=F, sep=',', na.strings="?")
summary(hung)
```

## Missing values treatment

```{r}
# Drop V11, V12 and V13 from the Hungarian data entirely.
keep.cols <- setdiff(names(hung), c("V11", "V12", "V13"))
hung <- hung[, keep.cols]
```

```{r}
# Normalise the missing-value encoding: recode every NA as -9 first.
hung[is.na(hung)] <- -9
# Columns to impute with k-NN (k = 7). Their -9 codes are turned back into
# NA so knn.imputation can locate the values to fill in.
var.imputable <- c("V4", "V5", "V6", "V7", "V8", "V9")
for (nom in var.imputable){
  hung[, nom][hung[, nom] == -9] <- NA
  variable <- hung[, nom]
  hung[, nom] <- knn.imputation(hung, variable, nom, 7)
}
```

```{r}
# Declare the categorical Hungarian columns as factors (via character so
# the imputed values keep their original labels).
var.factor <- c("V2", "V3", "V6", "V7", "V9", "V14")
hung[var.factor] <- lapply(hung[var.factor],
                           function(v) as.factor(as.character(v)))

# Level 2 of V7 has few values: merge it into level 1 and drop it.
hung$V7[hung$V7 == 2] <- 1
hung$V7 <- droplevels(hung$V7)
levels(hung$V7)

# Level 2 of V9 holds a single anomalous value: merge it into level 1.
hung$V9[hung$V9 == 2] <- 1
hung$V9 <- droplevels(hung$V9)
levels(hung$V9)
```


```{r}
summary(hung)
```

## Gaussianity treatment

```{r}
# Column indices of the numeric (non-factor) Hungarian variables.
# Vectorized with vapply instead of growing a vector inside a loop.
names_num <- unname(which(!vapply(hung, is.factor, logical(1))))
hung_numeric <- hung[, names_num]

# Report moderately correlated pairs (|r| in (0.5, 1)), both signs.
hung_cor <- cor(hung_numeric)
which(hung_cor > 0.5 & hung_cor < 1, arr.ind = TRUE)
which(-hung_cor > 0.5 & -hung_cor < 1, arr.ind = TRUE)
```


```{r}
# Normal Q-Q plot for each numeric column.
par(mfrow = c(2, 3))
for (i in seq_len(ncol(hung_numeric))) {
  # paste() builds a single title string; the original length-2 vector
  # passed as `main` drew the two pieces on top of each other.
  qqnorm(hung_numeric[, i], main = paste("Q-Q Plot:", names(hung_numeric)[i]))
  qqline(hung_numeric[, i], col = 2)
}
```


Boxcox

```{r}
# Box-Cox lambda profile for each numeric Hungarian variable, after
# shifting it to a strictly positive range.
par(mfrow = c(2, 3))
for (j in seq_len(ncol(hung_numeric))) {
  shifted <- hung_numeric[, j] - min(hung_numeric[, j]) + 1e-6
  boxcox(lm(shifted ~ 1), lambda = seq(-1, 1.5, by = 0.1),
         xlab = names(hung_numeric)[j])
}
```

We see that V10 requires a logarithmic transformation.

```{r}
hung[,"V10"] <- log(1+hung[,"V10"])
``` 

```{r}
# Check normality of the transformed V10.
qqnorm(hung[,"V10"])
qqline(hung[,"V10"], col=2)
```

## Feature extraction

Separate the data into train and test sets, setting a seed for reproducibility.

```{r}
# Reproducible 2/3 - 1/3 train/test split of the Hungarian data.
set.seed(2000)
n <- nrow(hung)
n.train <- round(2 * n / 3)

hung <- hung[sample(n), ]  # shuffle the rows first
train <- hung[1:n.train, ]
test <- hung[(n.train + 1):n, ]
```

```{r}
# Column indices of the numeric (non-factor) training variables.
# Vectorized with vapply instead of growing a vector inside a loop.
names_num <- unname(which(!vapply(train, is.factor, logical(1))))
train_num <- train[, names_num]
```

Extract PCA features.

```{r}
# PCA on the numeric Hungarian training columns; scree plot and summary
# show the variance captured by each component.
pca <- princomp(train_num)
screeplot(pca)
summary(pca)
```


```{r}
biplot(pca)
```

```{r}
# Hand-drawn biplot: standardised row scores plus scaled loadings.
Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) *0.3

# train$V14 is a factor with levels "0"/"1", so as.numeric() yields the
# level codes 1/2 and never 0 — the original comparisons against 0 and 1
# coloured the classes inconsistently with the legend. Map the codes
# directly: 1 (class 0) -> red, 2 (class 1) -> blue.
col.class <- c("red", "blue")[as.numeric(train$V14)]

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))
```

```{r}
# Discriminant analysis on all predictors; with a binary target there is a
# single discriminant axis, so `loadings` has one column and plot() shows
# it against the observation index.
fda <- lda(V14~., data=train)
#plot(fda)
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))
```


```{r}
# Attach the single discriminant coordinate as an extra feature,
# projecting the test rows with the fitted model.
train$LD1 <- loadings[,1]

fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[,1]
```


Correspondence analysis

```{r}
# Multiple correspondence analysis of the Hungarian categorical columns
# (Burt-matrix variant), on the full dataset.
ac <- mjca(hung[,names(hung) %in% var.factor], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")

# Indicator-matrix MCA on the training rows; the row coordinates are
# plotted by class and kept as candidate features.
ac_ind <- mjca(train[,names(train) %in% var.factor], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

mca.features <- ac_ind$rowcoord
```

## 
